########################################################
## This script takes the models trained on year X and
## uses them to make predictions for individuals from year Y
########################################################

########################################################
## Preparation of the workspace
########################################################


## Start from an empty workspace: delete every (non-hidden) object that is
## currently defined. Loaded packages and options are not affected by this.
rm(list = ls())

## load the required packages
library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)
library(VIM)
library(base)
library(ranger)
library(glmnet)
library(xgboost)

## Remember when the run started, so the run time can be worked out later
start_time <- Sys.time()

## Project root and the sub-folders that hold the data and the R programs
main <- "placeholder_main"

dataDirectory <- file.path(main, "Data")
RDirectory <- file.path(main, "Programs", "Output Generation")

## Bring in the helper scripts with the tuning and training functions
source(file.path(RDirectory, "102_0_caret_parameter_tuning_Function.R"))
source(file.path(RDirectory, "103_0_caret_predictions_Function.R"))

########################################################
## Define the locals for the loop
########################################################
  
# Model specification(s) whose fitted models are used for the predictions
# (the name is part of the file name of the stored models and of the output)
model_list <- "Full"

# Model specification of the individuals' data
# (the name is part of the file name of the .dta file with the data)
model_indiv <- "Full"

# Outcome variables for which predictions are created
dependent_variable <- c(
  "emplAft6M_0M_In",
  "emplAft6M_6M_In",
  "emplAft6M_12M_In"
)

# Sample years: the individuals of these years receive predictions
years_indiv <- 1992:2016

# Training years: the years on which the models were trained
# (these may differ depending on the model)
years <- 1992:2016


########################################################
## Running the loop
########################################################  
      
## For every combination of model, training year (y), outcome (dependent) and
## sample year (y_indiv): read the individuals of the sample year, keep the
## hold-out part of that sample, create predictions with the random forest,
## gradient boosting and LASSO models stored for the training year, and write
## them to one CSV file per combination.
for (model in model_list) {

  for (y in years) {

    for (dependent in dependent_variable) {

      ########################################################
      ## Load models
      ########################################################

      ## The stored models depend only on (model, dependent, y), so they are
      ## loaded once here instead of once per sample year.
      ## They are loaded into a separate environment so that objects stored
      ## in the .rda file cannot overwrite variables of this script, and so
      ## that a file without a `models` object is reported instead of a
      ## `models` object from an earlier iteration being reused silently.
      model_file <- paste0(
        dataDirectory, "/103_Models_", model, "_", dependent, "_", y, ".rda"
      )
      model_env <- new.env()
      load(file = model_file, envir = model_env)
      if (!exists("models", envir = model_env, inherits = FALSE)) {
        stop("No object named 'models' found in ", model_file, call. = FALSE)
      }
      models <- get("models", envir = model_env, inherits = FALSE)
      rm("model_env")

      for (y_indiv in years_indiv) {

        ## progress message at the start of each combination
        print(paste("Sample:", y_indiv, "; Model:", y, "; Outcome:", dependent,"Start"))

        ########################################################
        ## Load the dataset
        ########################################################
        data <- read_dta(paste0(
          dataDirectory, "/002_DataForR_", model_indiv, "_", y_indiv, ".dta"
        ))

        ## Fail early with a clear message if a column used below is absent
        required_columns <- c(
          "Gender", dependent, "n_order", "LopNr_PersonNr", "n"
        )
        missing_columns <- setdiff(required_columns, colnames(data))
        if (length(missing_columns) > 0) {
          stop(
            "Sample ", y_indiv, " lacks required column(s): ",
            paste(missing_columns, collapse = ", "),
            call. = FALSE
          )
        }

        ########################################################
        ## Define the sub-sample for creation of predictions
        ########################################################

        ## Observations are split by n_order: 10% (+1) of the rows are set
        ## aside for parameter tuning and 30% for training; every observation
        ## with a higher n_order belongs to the hold-out sample used here.
        ## The cut-offs are based on the row count BEFORE observations with a
        ## missing outcome are dropped.
        ## NOTE(review): presumably this matches the split used when the
        ## models were trained -- confirm against the training script.
        n_parametertuning <- round(nrow(data) * 0.1, digits = 0) + 1
        n_training <- round(nrow(data) * 0.3, digits = 0)
        n_training_plus_tuning <- n_training + n_parametertuning
        first_holdout_order <- n_training_plus_tuning + 1

        ## Column where the covariates start, and column of the outcome
        first_column <- which(colnames(data) == "Gender")
        y_column <- which(colnames(data) == dependent)

        ## Keep only observations with a non-missing outcome
        data <- data[complete.cases(data[, y_column]), ]

        ## Hold-out sample for which predictions are created
        data_pred <- data[data$n_order >= first_holdout_order, ]
        rm("data")

        ## Outcome as a factor, with its original codes as levels.
        ## NOTE(review): the earlier version relabelled the levels to
        ## "no"/"yes" only on a temporary copy that was deleted unused, so the
        ## written outcome always kept the original codes; that behaviour is
        ## preserved here. Confirm that no "no"/"yes" coding is expected.
        total_y <- factor(data_pred[[dependent]])

        ## All covariates: every column from "Gender" to the last column
        total_x <- data_pred[, first_column:ncol(data_pred)]

        ## Individual ID and n, to identify the predictions in the output
        persinfo <- as.data.frame(cbind(data_pred$LopNr_PersonNr, data_pred$n))

        rm("data_pred")

        ########################################################
        ## Create predictions
        ########################################################

        ## Random forest: keep the second column of the predictions
        ## (presumably the probability of the second outcome class -- confirm
        ## against the level order used in training)
        prob_rf <- predict(models$rff_final, total_x)
        prob_rf <- prob_rf[["predictions"]]
        prob_rf <- data.frame(prob_rf[, 2])

        ## Gradient boosting and LASSO both take the covariates as a numeric
        ## matrix, so the conversion is done only once
        total_x_matrix <- data.matrix(total_x)
        prob_boost <- predict(models$rboost_final, total_x_matrix)
        prob_lasso <- predict(
          models$rlasso_final, total_x_matrix, type = "response"
        )
        rm("total_x_matrix")

        ########################################################
        ## Save predictions
        ########################################################

        ## Outcome, the three predictions, individual ID and n in one table
        output <- cbind(total_y, prob_rf, prob_boost, prob_lasso, persinfo)
        write.csv(
          output,
          file = paste0(
            dataDirectory, "/103_predictionsR_", model, "_", dependent, "_",
            y_indiv, "Individuals_TrainedOn", y, "modelIndividuals_",
            model_indiv, ".csv"
          )
        )
        rm("output")

        ## progress message at the end of each combination
        print(paste(dependent,"End"))

      }
    }
  }
}

## Report how long the whole run took (start_time is set at the top)
print(Sys.time() - start_time)
